### Import Packages

In [373]:
import random
import json
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from recommender.models import Recommender
from recommender.data_processing import get_context, pad_list, map_column, MASK, PAD

### Check GPU is available

In [374]:
import torch
tensor = torch.rand(3,4)
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cpu

print(torch.cuda.is_available())
#True

tensor = tensor.to('cuda')
print(f"Device tensor is stored on: {tensor.device}")
# Device tensor is stored on: cuda:0

Device tensor is stored on: cpu
True
Device tensor is stored on: cuda:0


### Data processing

In [375]:
# 檔案路徑
data_csv_path = "./Data/disney_netflix/220604/disney_netflix_data.csv"
movies_path = "./Data/disney_netflix/220604/disney_netflix_movie.csv"
# 檢查是否是最新的check point
model_path = "./recommender_models/disney_netflix_100M/disney_netflix.ckpt" 
data = pd.read_csv(data_csv_path)
data.drop(data[data.movieId == 6484].index, inplace=True)
movies = pd.read_csv(movies_path)
data.sort_values(by="timestamp", inplace=True)

In [376]:
data, mapping, inverse_mapping = map_column(data, col_name="movieId")
grp_by_train = data.groupby(by="userId")

In [377]:
random.sample(list(grp_by_train.groups), k=10) # Test

[7402, 13017, 11711, 10172, 6491, 17433, 22511, 21857, 2518, 10121]

### Model Loading

In [378]:
model = Recommender(
        # vocab_size=len(data) + 2,
        vocab_size = len(movies) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
model.load_state_dict(torch.load(model_path)["state_dict"])

<All keys matched successfully>

In [379]:
# movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping} # old
movie_to_idx = {a: b for a, b in zip(movies.title.tolist(), movies.movieId.tolist())}
idx_to_movie = {v: k for k, v in movie_to_idx.items()}

### Predict function

In [380]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data)) * 5

def predict(list_movies, model, movie_to_idx, idx_to_movie):

    if type(list_movies[0]) == str:
        ids = [PAD] * (120 - len(list_movies) - 1) + [movie_to_idx[a] for a in list_movies] + [MASK]
    else:
        ids = [PAD] * (120 - len(list_movies) - 1) + list_movies + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()

    masked_pred = NormalizeData(masked_pred) # let the result's value in 1~5

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
    return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie], [masked_pred[ids] for ids in sorted_predicted_ids[:30]], [masked_pred[i] for i in [movie_to_idx[a] for a in list_movies]]


### Output Results (old)

#### Senario 1: Adventure/Fantasy 

In [381]:
# list_movies = ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
            #    "Harry Potter and the Chamber of Secrets (2002)",
            #    "Harry Potter and the Prisoner of Azkaban (2004)",
            #    "Harry Potter and the Goblet of Fire (2005)"]
list_movies=["Harry Potter and the Prisoner of Azkaban: Bonus Material",
            "Discovering the Real World of Harry Potter",
            "Harry Potter and the Chamber of Secrets",
            "Harry Potter and the Prisoner of Azkaban"]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Isle of Man TT 2004 Review',
  'Dinosaur Planet',
  'Brave',
  'Cenerentola',
  'La Spada nella Roccia',
  'Lilo & Stitch',
  'Aladdin',
  'The Lion King',
  'Reconstruction',
  'A Century of Science Fiction',
  'Dumb and Dumberer: When Harry Met Lloyd',
  'The Legend of Sleepy Hollow',
  'Dandelion Dead',
  'Hello Kitty & Friends',
  'Tiger Claws III: The Final Conflict',
  'Saturday Night Live: The Best of John Belushi',
  "Joggers' Park"],
 [5.0,
  4.987718,
  4.9097824,
  4.863618,
  4.8632946,
  4.799189,
  4.7235246,
  4.6782923,
  4.6704617,
  4.6181765,
  4.574536,
  4.559668,
  4.552233,
  4.530247,
  4.5282993,
  4.5164337,
  4.2713766,
  4.2427034,
  4.1442056,
  4.0905013,
  3.1916127,
  3.1867213,
  3.1592107,
  3.124918,
  2.9660523,
  2.9469726,
  2.9379938,
  2.913883,
  2.9079363,
  2.8601053],
 [1.1228826, 2.8005075, 1.4882796, 1.615702])

#### Senario 2:  Action/Adventure

In [382]:
# list_movies = ["Black Panther (2017)",
#                "Avengers, The (2012)",
#                "Avengers: Infinity War - Part I (2018)",
#                "Logan (2017)",
#                "Spider-Man (2002)",
#                "Spider-Man 3 (2007)",
#                "Spider-Man: Far from Home (2019)"]
list_movies = ["Spider-Man: The Return of the Green Goblin",
               "Spider-Man",
               "Spider-Man: The Venom Saga",
               "Spider-Man 2: Bonus Material",
               "X-Men: Evolution: Season 2"
]
top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
top_movie

(['Reconstruction',
  'A Century of Science Fiction',
  'An Everlasting Piece',
  'Pray for Power',
  'A.D. Police Files 1-3',
  'Mary Tyler Moore: Season 1',
  'High Spirits',
  'Hello Kitty & Friends',
  'Saturday Night Live: The Best of John Belushi',
  'Legend of the Dragon Kings: Blue Dragon',
  'Twitch of the Death Nerve',
  'The Very Thought of You',
  'Hot Shot',
  'Extreme Limits',
  'The Legend of Sleepy Hollow',
  'Changing Hearts',
  'Sliders: Season 3',
  'Lost in Translation',
  'Shadow of Doubt',
  'Lenny Bruce Without Tears',
  "He Knows You're Alone",
  'Invasion!',
  'Camille',
  'Dandelion Dead',
  'Dark Waters',
  'Kibakichi',
  'The Adventures of Buckaroo Banzai',
  'The Shaolin Invincibles',
  'Zardoz',
  'Frogs'],
 [5.0,
  4.997172,
  4.995948,
  4.8422375,
  4.8324256,
  4.8261247,
  4.811482,
  4.80267,
  4.79316,
  4.7860556,
  4.7631235,
  4.7604613,
  4.729977,
  4.72941,
  4.702652,
  4.6998196,
  4.698581,
  4.6896687,
  4.677308,
  4.6756887,
  4.6750975,

#### Senario 3: Comedy

In [383]:
# list_movies = ["Zootopia (2016)",
#                "Toy Story 3 (2010)",
#                "Toy Story 4 (2019)",
#                "Finding Nemo (2003)",
#                "Ratatouille (2007)",
#                "The Lego Movie (2014)",
#                "Ghostbusters (a.k.a. Ghost Busters) (1984)",
#                "Ace Ventura: When Nature Calls (1995)"]
# top_movie = predict(list_movies, model, movie_to_idx, idx_to_movie)
# top_movie

### Evaluation metrics

#### Test Random userId

In [384]:
random_userId = random.choice(list(set(data.userId))) # Random choose an user
user_input = data[(data.userId == random_userId)]
if len(user_input) >= 511:
    user_input = user_input.sample(n = 511) # Random choose 511 rows
answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
final_eval['answer'] = answer
final_eval['eval'] = evaluation
final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
count, relevant = 0, 0
for idx, elm in final_eval.iterrows():
    if elm['answer'] >= 3.5: relevant += 1
    count += 1
    print(relevant/count, count)
    if count == 10: break

1.0 1
1.0 2
1.0 3
1.0 4
1.0 5
0.8333333333333334 6
0.7142857142857143 7
0.625 8
0.5555555555555556 9
0.6 10


In [385]:
top_movie

['S.I.C.K.',
 'Mary Tyler Moore: Season 1',
 'Guncrazy',
 'Freddy vs. Jason: Bonus Material',
 'Superbabies: Baby Geniuses 2',
 'Uncorked',
 "Murphy's War",
 'A Man in Uniform',
 'Shadow of Doubt',
 'NFL Films: Ice Bowl / Green Bay Packers History',
 'Suicide Girls: The First Tour',
 'What I Want My Words to Do to You',
 "You're Invited to Mary-Kate and Ashley's Favorite Parties",
 'Beyond the Sea',
 'Jockey',
 "Thomas & Friends: Thomas' Sodor Celebration",
 'Bamboozled',
 'Tanner on Tanner',
 'Thundercats: Season 1: Vol. 1',
 'Today You Die',
 'Tai-Pan',
 'Reap the Wild Wind',
 'Rings',
 'Son Frere',
 'Everything Relative',
 'All the Real Girls',
 'Chi-Hwa-Seon: Painted Fire',
 'Dark Waters',
 'Dennis the Menace Strikes Again']

In [386]:
len(final_eval[final_eval.answer >= 3.5])

320

#### Test all userId

In [387]:
# k = 0 (useless) 1~10
precision_score = [0]*11 
recall_score = [0]*11
for userId in data.userId.unique():
    user_input = data[(data.userId == userId)]
    if len(user_input) > 10:
        if len(user_input) >= 511:
            user_input = user_input.sample(n = 511) # Random choose 511 rows
        answer = [a['rating'] for idx, a in user_input.iterrows() if a['movieId'] in idx_to_movie]
        user_input = [idx_to_movie[a] for a in user_input.movieId if a in idx_to_movie]
        top_movie, test, evaluation = predict(user_input, model, movie_to_idx, idx_to_movie)
        final_eval = pd.DataFrame(data = user_input,  columns = ['Movie'])
        final_eval['answer'] = answer
        final_eval['eval'] = evaluation
        final_eval.sort_values(by = ['eval'], ascending = False, inplace = True)
        recommend_item, relevant_item, relevant_recommend = 0, len(final_eval[final_eval.answer >= 3.5]), 0 # Precision分母 Recall分母 分子兩個一樣
        for idx, elm in final_eval.iterrows():
            if elm['answer'] >= 3.5: relevant_recommend += 1
            recommend_item += 1
            precision_score[recommend_item] += relevant_recommend / recommend_item
            if relevant_item>0:
                recall_score[recommend_item] += relevant_recommend / relevant_item
            if recommend_item == 10: break
        print(userId, 'is done!')

466 is done!
314 is done!
101 is done!
1086 is done!
1485 is done!
1561 is done!
684 is done!
266 is done!
24801 is done!
12275 is done!
22432 is done!
13891 is done!
25375 is done!
10268 is done!
8660 is done!
23418 is done!
23445 is done!
14690 is done!
24577 is done!
17864 is done!
2007 is done!
19915 is done!
4832 is done!
26839 is done!
7751 is done!
15749 is done!
10564 is done!
7921 is done!
26479 is done!
1922 is done!
9592 is done!
3998 is done!
15893 is done!
7577 is done!
18527 is done!
3786 is done!
8174 is done!
14682 is done!
5900 is done!
437 is done!
3694 is done!
24252 is done!
7249 is done!
4783 is done!
6689 is done!
1636 is done!
7548 is done!
25307 is done!
17589 is done!
23316 is done!
7652 is done!
11857 is done!
17656 is done!
15191 is done!
1619 is done!
1333 is done!
11599 is done!
19703 is done!
25623 is done!
11764 is done!
13227 is done!
21242 is done!
1350 is done!
14309 is done!
19290 is done!
4277 is done!
12580 is done!
16782 is done!
7167 is done!
7953

#### Precision Score

In [388]:
for i in range(1,11):
    print("k = ", i, precision_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.5252903709254403
k =  2 0.5249156987635819
k =  3 0.5237916822780069
k =  4 0.5231360059947546
k =  5 0.5197452229299354
k =  6 0.51970151117772
k =  7 0.5194294278220847
k =  8 0.5210284750843013
k =  9 0.5202739269805601
k =  10 0.5207006369426758


#### Recall Score

In [389]:
for i in range(1,11):
    print("k = ", i, recall_score[i] / len(data.userId.unique())) # total value / total number of userId

k =  1 0.01528795269706515
k =  2 0.03242665836141798
k =  3 0.04926228511274295
k =  4 0.06574070894093077
k =  5 0.08041882094017283
k =  6 0.09342107027048938
k =  7 0.10602727073039206
k =  8 0.12032097910528547
k =  9 0.13410388607562354
k =  10 0.1486177016234635


#### F1 Score

In [390]:
for i in range(1,11):
    prec_score = precision_score[i] / len(data.userId.unique())
    reca_score = recall_score[i] / len(data.userId.unique())
    print("k = ", i, (prec_score*reca_score/(prec_score+reca_score))*2) # total value / total number of userId

k =  1 0.02971119629480314
k =  2 0.06108009489949518
k =  3 0.09005495698616449
k =  4 0.11680316451425067
k =  5 0.13928624496658035
k =  6 0.15837313080439663
k =  7 0.17610710604422136
k =  8 0.1954960929781823
k =  9 0.21324303495579527
k =  10 0.23123625168279202


In [391]:
count / (len(data.userId.unique())*30) #old

6.244536030972898e-05

In [396]:
print('user: ', len(data.userId.unique()))
print('資料量: ', len(data))

user:  5338
資料量:  1047970
