In [1]:
import pathlib
from argparse import ArgumentParser

import numpy as np
import pandas as pd
import torch as th
import sklearn as sk

from pytorch_lightning import seed_everything, Trainer

from torch_factorization_models.implicit_mf import ImplicitMatrixFactorization
from torch_factorization_models.movielens import MovielensDataset

### Load the dataset

In [2]:
movielens = MovielensDataset("/home/karl/Projects/datasets/ml-20m/")

In [3]:
preprocessor = movielens.preprocessor

In [4]:
user_xformer = preprocessor.named_transformers_['user_id']
item_xformer = preprocessor.named_transformers_['item_id']

### Set up the model and the splits

In [5]:
seed_everything(42)  # same seed used to create splits in training

42

In [6]:
# TODO: Figure out how to set the number of users and items

In [7]:
parser = ArgumentParser(add_help=False)
parser = Trainer.add_argparse_args(parser)
parser = ImplicitMatrixFactorization.add_model_specific_args(parser)

args = parser.parse_args(args=[])
args.num_users = 138287
args.num_items = 20720
# args.use_biases = False
args.embedding_dim = 32

args

Namespace(accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, benchmark=False, beta1=0.9, beta2=0.999, check_val_every_n_epoch=1, checkpoint_callback=True, default_root_dir=None, deterministic=False, distributed_backend=None, early_stop_callback=False, embedding_dim=32, fast_dev_run=False, gpus=<function Trainer._gpus_arg_default at 0x7fb30798ae50>, gradient_clip_val=0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_gpu_memory=None, log_save_interval=100, logger=True, loss='hinge', max_epochs=1000, max_steps=None, min_epochs=1, min_steps=None, momentum=0.9, num_items=20720, num_nodes=1, num_processes=1, num_sanity_val_steps=2, num_users=138287, overfit_batches=0.0, overfit_pct=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=1, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=

In [9]:
model = ImplicitMatrixFactorization(args)

state_dict = th.load("../models/38ov3g28-honest-lake-213.pt")

# preprocessor = state_dict['preprocessor']
del state_dict['preprocessor']

model.load_state_dict(state_dict)

<All keys matched successfully>

In [10]:
# user_encoder = sk.preprocessing.OrdinalEncoder()
# print(preprocessor)
# params = preprocessor['transformers'][0][1]
# print(params.categories_)
# user_encoder.set_params(**params)
# user_encoder

### Load movie data

In [11]:
movies_df = pd.read_csv("/home/karl/Projects/datasets/ml-20m/movies.csv")

movies_df[:10]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


### Define a function to retrieve similar movies

In [12]:
# model.use_biases = False

In [13]:
# def fetch_similar(df, movieId, k):
#     movieIds = th.tensor(item_xformer.transform([[movieId], [10]])).flatten()
#     print(movieIds)
#     topk_scores, topk_indices = model.similar_to_items(movieIds)
#     print(topk_scores, topk_indices)
#     similarIds = item_xformer.inverse_transform(topk_indices.t()[:,0:1].numpy()).flatten()
#     print(similarIds.flatten())
#     return df[df['movieId'].isin(similarIds)]
# #     return scores
# #     return similarIds

def fetch_similar(df, movieId, k):
    movieIds = th.tensor(item_xformer.transform([[movieId], [1]])).flatten()
#     print(movieIds)
    topk_scores, topk_indices = model.similar_to_items(movieIds, k=k)
    print(topk_scores)
    print(topk_indices)
#     similar_ids = topk_indices.t()[:,0:1].numpy().flatten()
    ids_to_transform = topk_indices.t()[:,0:1].numpy()
    print(ids_to_transform)
    similar_ids = item_xformer.inverse_transform(ids_to_transform).flatten()
    print(similar_ids.flatten())
#     return df.iloc[similar_ids, :]
    return df[df['movieId'].isin(similar_ids)]
#     return scores
#     return similarIds

### Try it out!

In [14]:
# A new hope - 260
# model.use_biases = False
model.use_biases = False
fetch_similar(movies_df, 3328, 50)

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9999, 0.9997, 0.9997, 0.9997, 0.9997, 0.9996, 0.9996, 0.9995, 0.9994,
         0.9994, 0.9994, 0.9994, 0.9993, 0.9993, 0.9993, 0.9992, 0.9992, 0.9991,
         0.9991, 0.9991, 0.9991, 0.9990, 0.9990, 0.9989, 0.9988, 0.9988, 0.9988,
         0.9987, 0.9987, 0.9987, 0.9987, 0.9987, 0.9984, 0.9984, 0.9984, 0.9984,
         0.9983, 0.9982, 0.9982, 0.9981, 0.9981, 0.9981, 0.9980, 0.9980, 0.9979,
         0.9979, 0.9978, 0.9977, 0.9976, 0.9976]])
tensor([[ 3863,  1566, 12104,  4525,  1817,  8980,  4175,  1702,  8862,  4887,
          

Unnamed: 0,movieId,title,genres
1568,1622,Kicked in the Head (1997),Comedy|Drama
1704,1773,Tokyo Fist (Tokyo ken) (1995),Action|Drama
1711,1782,Little City (1998),Comedy|Romance
1732,1809,Fireworks (Hana-bi) (1997),Crime|Drama
1819,1903,Hav Plenty (1997),Comedy
2219,2304,Love Is the Devil (1998),Drama
2458,2543,Six Ways to Sunday (1997),Comedy
2679,2765,"Acid House, The (1998)",Comedy|Drama
3093,3180,Play it to the Bone (1999),Comedy|Drama
3158,3245,I Am Cuba (Soy Cuba/Ya Kuba) (1964),Drama


### Tests

In [15]:
model.use_biases = False
query_ids = th.tensor(list(range(2)))
scores, ids = model.similar_to_items(query_ids, k=3720)

# print(ids[:, 0])
# assert (ids[:, 0] == query_ids).all()

print(scores)
print(ids)

tensor([[9.9986e-01, 9.9973e-01, 9.9972e-01,  ..., 4.6388e-04, 4.6260e-04,
         4.6259e-04],
        [9.9999e-01, 9.9999e-01, 9.9999e-01,  ..., 2.9181e-01, 2.9150e-01,
         2.9123e-01]])
tensor([[    0,   148,   293,  ..., 11753,  2404,  4820],
        [10046,     1, 12666,  ...,  2556,  1698, 14795]])


In [16]:
model.use_biases = True
query_ids = th.tensor(list(range(100)))
scores, ids = model.similar_to_items(query_ids, k=5)

# print(ids[:, 0])
# assert (ids[:, 0] == query_ids).all()

print(scores)
print(ids)

tensor([[0.9999, 0.9997, 0.9997, 0.9997, 0.9997],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 0.9995, 0.9995, 0.9995, 0.9995],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 0.9999, 0.9999, 0.9998, 0.9998],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 0.9999, 0.9998, 0.9997, 0.9997],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
