### Import Packages

In [10]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [11]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [12]:
# 檔案路徑
# data_csv_path = "./netflix_prize_dataset/netflix_data_25M_drop.csv"
# movies_path = "./netflix_prize_dataset/netflix_movie2.csv"
data_csv_path = './disney_review_dataset/disney_data_drop.csv'
movies_path = './disney_review_dataset/disney_movie.csv'
# 檢查是否是最新的check point
model_path = "./recommender_models/recommender-v4.ckpt" 
data = pd.read_csv(data_csv_path)
# data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
# data.sort_values(by="date", inplace=True) # Netflix movie
data.sort_values(by="timestamp", inplace=True)

In [13]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [14]:
random.sample(list(grp_by_train.groups), k=10) # Test

[479, 3803, 874, 4126, 434, 474, 4829, 1046, 3796, 882]

### Model Loading

In [15]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [17]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
# movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
movie_to_idx = {}
for a, b in zip(movies.movie.tolist(), movies.movieId.tolist()):
    if a in movie_to_idx:
        movie_to_idx[f'{a}_{b}'] = b
    else:
        movie_to_idx[a] = b
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [18]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [None]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
# list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
#             "Discovering the Real World of Harry Potter",
#             "Harry Potter and the Chamber of Secrets",
#             "Harry Potter and the Prisoner of Azkaban"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

#### Senario 2:  Action/Adventure

In [None]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
# list_movies = ["Spider-Man: The Return of the Green Goblin",
#                "Spider-Man",
#                "Spider-Man: The Venom Saga",
#                "Spider-Man 2: Bonus Material",
#                "X-Men: Evolution: Season 2"
# ]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

#### Senario 3: Comedy

In [None]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [None]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

In [None]:
top_movie

In [None]:
len(final_eval[final_eval.answer >= 3.5])
len(data.userId.unique())
data.userId.unique()[:20]
data[data.userId == 371560]

#### Test all userId

In [19]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    print(len(user_input))
    if len(user_input) >= 511:
        user_input = user_input.sample(n = 511) # Random choose 511 rows
    answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
    user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
    top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
    final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
    final_eval['answer'] = answer
    final_eval['eval'] = evaluation
    final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
    recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
    for idx, elm in final_eval.iterrows():
        if elm['answer'] >= 3.5: relevant_recommend += 1
        recommend_item += 1
        precision_score[recommend_item] += relevant_recommend / recommend_item
        if relevant_item>0:
            recall_score[recommend_item] += relevant_recommend / relevant_item
        if recommend_item == 10: break
    print(userId, 'is done!')

2
466 is done!
2
303 is done!
3
64 is done!
3
314 is done!
5
80 is done!
6
885 is done!
2
645 is done!
2
2564 is done!
15
101 is done!
2
649 is done!
2
1264 is done!
2
1485 is done!
2
1574 is done!
3
1561 is done!
2
2227 is done!
2
761 is done!
2
684 is done!
2
1077 is done!
2
2597 is done!
11
266 is done!
3
830 is done!
4
394 is done!
7
1086 is done!
2
2834 is done!
3
1619 is done!
2
1090 is done!
2
1248 is done!
2
1182 is done!
2
973 is done!
2
836 is done!
2
301 is done!
2
1594 is done!
2
1509 is done!
2
1491 is done!
3
1564 is done!
3
1055 is done!
8
214 is done!
7
401 is done!
4
232 is done!
2
491 is done!
2
407 is done!
2
2844 is done!
3
2860 is done!
13
127 is done!
8
785 is done!
2
2579 is done!
2
1075 is done!
3
2467 is done!
5
2409 is done!
2
2487 is done!
5
247 is done!
5
501 is done!
4
1056 is done!
8
264 is done!
2
82 is done!
2
451 is done!
2
695 is done!
2
2774 is done!
2
474 is done!
3
2830 is done!
2
817 is done!
2
260 is done!
2
495 is done!
2
2833 is done!
7
261 is d

#### Precision Score

In [20]:
for i in range(1,11):
    print("k = ", i, precision_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.7432762836185819
k =  2 0.7396088019559902
k =  3 0.3512632436837815
k =  4 0.2169926650366748
k =  5 0.16894865525672365
k =  6 0.13121434392828032
k =  7 0.10548375829549421
k =  8 0.08236552567237164
k =  9 0.06886715566422166
k =  10 0.05745721271393641


#### Recall Score

In [21]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.3275077803446853
k =  2 0.642223975067712
k =  3 0.2972655330989861
k =  4 0.17252626682009675
k =  5 0.14447773340247802
k =  6 0.11570269833159956
k =  7 0.09611806925196825
k =  8 0.07831879775465278
k =  9 0.0673223607496583
k =  10 0.05941080628625573


#### F1 Score

In [44]:
for i in range(1,11):
    prec_score = precision_score[i] / len(data.userId.unique())
    reca_score = recall_score[i] / len(data.userId.unique())
    print("k = ", i, (prec_score*reca_score/(prec_score+reca_score))*2) # total value / total number of userId

k =  1 0.4546738675392151
k =  2 0.6874847849684445
k =  3 0.3220164135498788
k =  4 0.19222138573684022
k =  5 0.15575790460597816
k =  6 0.12297129038451456
k =  7 0.10058336581713005
k =  8 0.08029120463664627
k =  9 0.06808599691829274
k =  10 0.05841768113294115


In [31]:
len(data.userId.unique())
#data.userId.max()

818