### Import Packages

In [None]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [None]:
import torch

# Create a small random tensor; freshly created tensors live on the CPU.
tensor = torch.rand(3, 4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

cuda_available = torch.cuda.is_available()
print(cuda_available)
# True (on a machine with a usable GPU)

# Only move the tensor when CUDA is actually usable; calling .to('cuda')
# unconditionally raises on CPU-only machines and aborts the notebook.
if cuda_available:
    tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0 (or cpu when no GPU is available)

### Data processing

In [None]:
# File paths for the ratings table and the movie catalogue.
data_csv_path = "./netflix_prize_dataset/netflix_data_25M_drop.csv"
movies_path = "./netflix_prize_dataset/netflix_movie2.csv"
# Check that this points at the latest checkpoint before running.
model_path = "./recommender_models/recommender-v1.ckpt"

movies = pd.read_csv(movies_path)

# Load the ratings, remove every row for movieId 6484, then order by date.
data = pd.read_csv(data_csv_path)
rows_to_drop = data.index[data.movieId == 6484]
data = data.drop(index=rows_to_drop).sort_values(by="date")

In [None]:
# Run the project helper on the movieId column. It returns the (possibly
# modified) frame plus two lookup tables.
# NOTE(review): presumably this replaces raw movieId values with contiguous
# indices and returns id->index / index->id dicts — confirm in
# recommender.data_processing.
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
# Group the rating rows per user.
grp_by_train = data.groupby(by="userId")

In [None]:
# Sanity check: display 10 randomly chosen group keys (userIds).
random.sample(list(grp_by_train.groups), k=10)

### Model Loading

In [None]:
# Build the network with the same hyper-parameters used for the checkpoint.
# The vocabulary is sized from the movie catalogue plus two extra slots
# (an earlier version used len(data) + 2 instead).
# NOTE(review): the "+ 2" presumably reserves ids for PAD and MASK — confirm
# against recommender.models.
model = Recommender(
    vocab_size=len(movies) + 2,
    lr=1e-4,
    dropout=0.3,
)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on
# a CPU-only machine; the model itself is used on the CPU in this notebook.
model.load_state_dict(torch.load(model_path, map_location="cpu")["state_dict"])

In [None]:
# Build title -> movieId and movieId -> title lookup tables from the catalogue.
# A title that was already registered gets the movieId appended ("title_id")
# so that repeated titles do not overwrite each other.
# NOTE(review): the values are the raw movieId values from the catalogue, not
# the ids produced by map_column (an earlier version went through `mapping`) —
# confirm this matches the ids the model was trained on.
movie_to_idx = {}
for title, movie_id in zip(movies.title.tolist(), movies.movieId.tolist()):
    key = f'{title}_{movie_id}' if title in movie_to_idx else title
    movie_to_idx[key] = movie_id

# Reverse lookup: movieId -> (possibly suffixed) title.
idx_to_movie = {movie_id: key for key, movie_id in movie_to_idx.items()}

### Predict function

In [None]:
def NormalizeData(data):
    """Min-max scale *data* linearly onto the range [0, 5].

    Args:
        data: Array-like of numeric values.

    Returns:
        A NumPy array of the same shape where the smallest input maps to 0
        and the largest to 5. When every value is identical (zero range) the
        result is all zeros instead of NaN from a 0/0 division.
    """
    values = np.asarray(data)
    lowest = np.min(values)
    span = np.max(values) - lowest
    shifted = values - lowest
    if span == 0:
        # Constant input: there is no spread to rescale, so avoid dividing
        # by zero and report the lower bound everywhere.
        return shifted * 0.0
    return shifted / span * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie, seq_len=120, top_k=30):
    """Score every vocabulary id as the next item after *list_movies*.

    The history is left-padded with PAD up to ``seq_len`` tokens, a MASK token
    is appended, and the model output at the last position (the MASK slot) is
    rescaled with ``NormalizeData`` to the range 0~5.

    Args:
        list_movies: Watched movies, either as titles (keys of
            ``movie_to_idx``) or as already-resolved integer ids. May be empty.
        model: Callable taking a ``(1, sequence)`` long tensor.
            NOTE(review): assumed to return ``(batch, sequence, vocab)``
            scores — confirm against recommender.models.
        movie_to_idx: Mapping title -> id.
        idx_to_movie: Mapping id -> title.
        seq_len: Target length of the padded sequence including the MASK
            token. Longer histories are passed through unpadded.
        top_k: Number of top-ranked unseen ids considered for recommendation.

    Returns:
        A tuple ``(titles, scores, history_scores)``:
        ``titles`` are the titles of the ``top_k`` best unseen ids that have an
        entry in ``idx_to_movie``; ``scores`` are their scores, aligned
        one-to-one with ``titles``; ``history_scores`` are the scores the
        model assigns to each input movie, in input order.

    Raises:
        KeyError: If a title in *list_movies* is missing from ``movie_to_idx``.
    """
    # Resolve the history to ids once so that both the model input and the
    # per-input scores below work for title input and for id input alike.
    if list_movies and isinstance(list_movies[0], str):
        history_ids = [movie_to_idx[title] for title in list_movies]
    else:
        history_ids = list(list_movies)

    # A negative pad count yields an empty list, so long histories are kept whole.
    ids = [PAD] * (seq_len - len(history_ids) - 1) + history_ids + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    masked_pred = prediction[0, -1].detach().cpu().numpy()

    masked_pred = NormalizeData(masked_pred)  # rescale the scores to 0~5

    # Rank ids from best to worst and drop everything that was part of the
    # input (history, PAD, MASK); a set makes each membership test O(1).
    seen_ids = set(ids)
    ranked_ids = [
        candidate
        for candidate in np.argsort(masked_pred).tolist()[::-1]
        if candidate not in seen_ids
    ]

    # Filter the top-k by known titles first so titles and scores stay aligned.
    recommended_ids = [
        candidate for candidate in ranked_ids[:top_k] if candidate in idx_to_movie
    ]
    titles = [idx_to_movie[candidate] for candidate in recommended_ids]
    scores = [masked_pred[candidate] for candidate in recommended_ids]
    history_scores = [masked_pred[movie_id] for movie_id in history_ids]

    return titles, scores, history_scores


### Output Results (old)

#### Scenario 1: Adventure/Fantasy

In [None]:
# Earlier title list (titles carried release years), kept for reference:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
# Watch history for this scenario; every title must be a key of movie_to_idx.
list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
            "Discovering the Real World of Harry Potter",
            "Harry Potter and the Chamber of Secrets",
            "Harry Potter and the Prisoner of Azkaban"]
# predict returns a 3-tuple: recommended titles, their scores, and the
# scores of the input movies themselves.
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

#### Scenario 2: Action/Adventure

In [None]:
# Earlier title list (titles carried release years), kept for reference:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
# Watch history for this scenario; every title must be a key of movie_to_idx.
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
# predict returns a 3-tuple: recommended titles, their scores, and the
# scores of the input movies themselves.
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

#### Scenario 3: Comedy

In [None]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [None]:
# Pick one user at random and fetch that user's rating rows.
random_userId = random.choice(list(set(data.userId)))
user_input = data[data.userId == random_userId]
if len(user_input) >= 511:
    user_input = user_input.sample(n=511)  # cap the history at 511 random rows

# Keep only the rows whose movieId has a title in idx_to_movie, so that the
# ratings and the titles stay aligned row by row.
known_mask = [movie_id in idx_to_movie for movie_id in user_input.movieId]
known_rows = user_input.loc[known_mask]
answer = known_rows['rating'].tolist()
user_input = [idx_to_movie[movie_id] for movie_id in known_rows.movieId]

top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)

# One row per watched movie: title, the user's real rating, the model score.
final_eval = pd.DataFrame(data=user_input, columns=['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by=['eval'], ascending=False, inplace=True)

# Print precision@k for k = 1..10, counting a rating >= 3.5 as relevant.
count, relevant = 0, 0
for rating in final_eval['answer'].head(10):
    if rating >= 3.5:
        relevant += 1
    count += 1
    print(relevant/count, count)

In [None]:
# Display the recommended titles returned by the most recent predict() call.
top_movie

In [None]:
# Ad-hoc inspection; in a notebook only the last expression's value is shown.
# Number of rows in the current final_eval frame with a rating >= 3.5.
len(final_eval[final_eval.answer >= 3.5])
# Number of distinct users in the ratings table.
len(data.userId.unique())
# First 20 distinct userIds.
data.userId.unique()[:20]
# All rating rows of one hard-coded user.
data[data.userId == 371560]

#### Test all userId

In [None]:
# Running sums of precision@k and recall@k over all users. Index 0 is unused
# so that index k (1..10) lines up with the cut-off k.
# NOTE(review): a user with fewer than k scored movies contributes nothing at
# that k, while the later averages divide by the total number of users —
# confirm this is the intended definition.
precision_score = [0] * 11
recall_score = [0] * 11

# Walk the per-user groups in a single pass instead of re-filtering the whole
# frame for every userId. sort=False keeps first-appearance order, the same
# order data.userId.unique() would give.
for userId, user_input in data.groupby("userId", sort=False):
    if len(user_input) >= 511:
        user_input = user_input.sample(n=511)  # cap the history at 511 random rows

    # Keep only rows whose movieId has a title, so ratings and titles align.
    known_mask = [movie_id in idx_to_movie for movie_id in user_input.movieId]
    known_rows = user_input.loc[known_mask]
    if known_rows.empty:
        # Nothing to score for this user; predict cannot rank an empty history
        # against ratings, so skip instead of aborting the whole run.
        print(userId, 'is skipped (no known movies)')
        continue

    answer = known_rows['rating'].tolist()
    user_input = [idx_to_movie[movie_id] for movie_id in known_rows.movieId]
    top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)

    # One row per watched movie: title, the user's real rating, the model score.
    final_eval = pd.DataFrame(data=user_input, columns=['Movie'])
    final_eval['answer'] = answer
    final_eval['eval'] = evaluation
    final_eval.sort_values(by=['eval'], ascending=False, inplace=True)

    # relevant_item is the recall denominator, the loop index the precision
    # denominator, and relevant_recommend the numerator shared by both.
    relevant_item = len(final_eval[final_eval.answer >= 3.5])
    relevant_recommend = 0
    for recommend_item, rating in enumerate(final_eval['answer'].head(10), start=1):
        if rating >= 3.5:
            relevant_recommend += 1
        precision_score[recommend_item] += relevant_recommend / recommend_item
        if relevant_item > 0:
            recall_score[recommend_item] += relevant_recommend / relevant_item
    print(userId, 'is done!')

#### Precision Score

In [None]:
# Average precision@k over all users: accumulated sum / number of distinct users.
n_users = len(data.userId.unique())
for k in range(1, 11):
    mean_precision = precision_score[k] / n_users
    print("k = ", k, mean_precision)

#### Recall Score

In [None]:
# Average recall@k over all users: accumulated sum / number of distinct users.
n_users = len(data.userId.unique())
for k in range(1, 11):
    mean_recall = recall_score[k] / n_users
    print("k = ", k, mean_recall)

#### F1 Score

In [None]:
# F1@k: harmonic mean of the user-averaged precision@k and recall@k.
n_users = len(data.userId.unique())
for i in range(1, 11):
    prec_score = precision_score[i] / n_users
    reca_score = recall_score[i] / n_users
    # Use the per-k scalars here; multiplying by the recall_score list itself
    # raises a TypeError.
    denominator = prec_score + reca_score
    # Both averages being zero would divide by zero; report F1 = 0 instead.
    f1_score = 2 * prec_score * reca_score / denominator if denominator else 0.0
    print("k = ", i, f1_score)

In [None]:
# Leftover from an earlier version of the evaluation (marked "old").
# NOTE(review): `count` is only assigned by the random-user cell, so this value
# does not reflect the all-users loop — likely stale; confirm before relying on it.
count / (len(data.userId.unique())*30) #old

In [None]:
# Number of distinct users; this is the divisor used when averaging the metrics.
len(data.userId.unique())