In [1]:
import faiss
import pathlib
import time
from argparse import ArgumentParser

import numpy as np
import torch as th

from pytorch_lightning import seed_everything, Trainer
from ranking_metrics_torch.precision_recall import precision_at, recall_at
from ranking_metrics_torch.cumulative_gain import ndcg_at
from torch_factorization_models.implicit_mf import ImplicitMatrixFactorization
from torch_factorization_models.movielens import MovielensDataset, MovielensDataModule

from pybloomfilter import BloomFilter

In [2]:
seed_everything(42)  # same seed used to create splits in training

42

### Load dataset

In [3]:
movielens_module = MovielensDataModule("/home/karl/Projects/datasets/ml-20m/")
movielens_module.setup()

In [4]:
movielens = movielens_module.dataset
preprocessor = movielens.preprocessor
user_xformer = preprocessor.named_transformers_['user_id']
item_xformer = preprocessor.named_transformers_['item_id']

### Load the model

In [5]:
parser = ArgumentParser(add_help=False)
parser = Trainer.add_argparse_args(parser)
parser = ImplicitMatrixFactorization.add_model_specific_args(parser)

args = parser.parse_args(args=[])
args.num_users = 138287
args.num_items = 20720
# args.use_biases = False
args.embedding_dim = 32
args.eval_cutoff = th.tensor([100])

args

Namespace(accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, benchmark=False, beta1=0.9, beta2=0.999, check_val_every_n_epoch=1, checkpoint_callback=True, default_root_dir=None, deterministic=False, distributed_backend=None, early_stop_callback=False, embedding_dim=32, eval_cutoff=tensor([100]), fast_dev_run=False, gpus=<function Trainer._gpus_arg_default at 0x7faf9e367e50>, gradient_clip_val=0, learning_rate=0.1, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_gpu_memory=None, log_save_interval=100, logger=True, loss='logistic', max_epochs=1000, max_steps=None, min_epochs=1, min_steps=None, momentum=0.9, num_items=20720, num_nodes=1, num_processes=1, num_sanity_val_steps=2, num_users=138287, optimizer='sgd', overfit_batches=0.0, overfit_pct=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=1, reload_dataloaders_ev

In [6]:
model = ImplicitMatrixFactorization(args)

state_dict = th.load("../models/38ov3g28-honest-lake-213.pt")

# preprocessor = state_dict['preprocessor']
del state_dict['preprocessor']
state_dict['global_bias_idx'] = th.LongTensor([0])

model.load_state_dict(state_dict)

<All keys matched successfully>

In [7]:
if th.cuda.is_available():
    model.cuda()
    
movielens_module.dataset.to_(device=model.device)

In [8]:
val_dataloader = movielens_module.val_dataloader(by_user=True)

In [9]:
train_dataloader = movielens_module.train_dataloader(by_user=True)

### Model

In [10]:
model_metrics = model.compute_validation_metrics(
    val_dataloader,
    model.eval_predict,
)

In [11]:
print(f"Precision: {model_metrics['precision']:.4f}")
print(f"Recall: {model_metrics['recall']:.4f}")
print(f"NDCG: {model_metrics['ndcg']:.4f}")

Precision: 0.0261
Recall: 0.5015
NDCG: 0.1874


### ANN Search Indexing

In [12]:
import faiss

dim = model.hparams.embedding_dim
dim

32

In [13]:
item_vectors = np.array(model.item_embeddings.weight.cpu().data)

In [14]:
res = faiss.StandardGpuResources()

flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0

# Create an index and add item vectors
start = time.perf_counter()

exact_index = faiss.GpuIndexFlatIP(res, dim, flat_config)  
exact_index.add(item_vectors)

end = time.perf_counter()

elapsed = (end - start) * 1000

print(f"Indexed: {exact_index.ntotal} items")
print(f"Elapsed: {elapsed:.4f} ms")

Indexed: 20720 items
Elapsed: 302.9731 ms


In [15]:
# Create an index for approximate search with product quantization

start = time.perf_counter()
res = faiss.StandardGpuResources()

approx_index = faiss.index_factory(dim, "IVF1024,PQ32", faiss.METRIC_INNER_PRODUCT)
co = faiss.GpuClonerOptions()
# here we are using a 64-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
# co.useFloat16 = True

approx_index = faiss.index_cpu_to_gpu(res, 0, approx_index, co)

approx_index.train(item_vectors)
approx_index.add(item_vectors)

approx_index.nprobe = 30

end = time.perf_counter()

elapsed = (end - start) * 1000

print(f"Indexed: {approx_index.ntotal} items")
print(f"Elapsed: {elapsed:.4f} ms")

Indexed: 20720 items
Elapsed: 24866.1954 ms


### Bloom filters

In [16]:
train_dataloader.dataset.num_users

138287

In [17]:
bloom_filters = {}

for user_id in range(train_dataloader.dataset.num_users):
    if user_id % 1000 == 0:
        print(f"\rUser {user_id}", sep=" ", end="", flush=True)
    
    interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()    
    item_ids = interactions.indices()[1]
    
    bloom = BloomFilter(10, 0.1)
    bloom.update(item_ids)
        
    bloom_filters[user_id] = bloom
    
print("\nDone.")

User 138000
Done.


### Pipeline

In [18]:
def fetch_embeddings(model, user_id, item_ids):
    item_embeddings = model.item_embeddings.weight[item_ids.to(dtype=th.long)]
        
    # Use model vector embedding
    user_embedding = model.user_embeddings.weight[user_id].unsqueeze(dim=0)
        
    # Compute user embedding by averaging interacted item embeddings
    user_avg_embedding = th.mean(item_embeddings, dim=0).unsqueeze(dim=0)
    
    return item_embeddings, user_embedding, user_avg_embedding

In [19]:
def user_embedding_candidates(index, user_embedding, k, num_items):
    overfetch = 1.2
    num_candidates = int(overfetch * k)
    
    if not user_embedding.isnan().any():
        neighbor_scores, neighbor_indices = index.search(np.array(user_embedding.cpu()), num_candidates)
    else:
        neighbor_indices = th.randint(num_items, (num_candidates,))
    
    candidates = th.tensor(neighbor_indices).flatten().unique()
    
#     if candidates.shape[0] != k:
#         print(candidates.shape)
    
    return candidates

In [20]:
def filter_candidates(user_id, candidates):
    bloom = bloom_filters[user_id]
    filtered = list(filter(lambda c: c not in bloom, candidates.numpy()))

#     if len(candidates) != len(filtered):
#         print(f"\nFiltered {len(candidates)} down to {len(filtered)}")

    return filtered

In [21]:
def score_candidates(model, user_embedding, candidates, k, num_items):
    candidates = th.tensor(candidates, dtype=th.long)
    
    # Defensively normalize number of scored candidates to k
    if candidates.shape[0] > k:
        candidates = candidates[:k]
    elif candidates.shape[0] < k:
        padding_size = k - candidates.shape[0]
        candidates = th.cat([candidates, th.randint(num_items, (padding_size,))])
    
    item_vectors = model.item_embeddings.weight.squeeze()
    item_biases = model.item_biases.weight.squeeze()

    scores = model._similarity_scores(
        user_embedding, th.empty((1, 1)), item_vectors, item_biases
    ).flatten()
 
    masked_scores = th.empty_like(scores).fill_(-float("inf"))
    masked_scores[candidates] = scores[candidates]
    
    return masked_scores

In [22]:
def dither_scores(scores, k, epsilon=1.5):
    log_ranks = th.log(th.arange(k) + 1.0)
    std_dev = th.sqrt(th.log(th.tensor(epsilon)))
    
    # Compute dithered scores based on item ranks
    dithered_scores = -(log_ranks + th.randn_like(log_ranks) * std_dev)
        
    # Replace raw scores with dithered scores
    _, topk_indices = th.topk(scores, k)
    scores[topk_indices] = dithered_scores.to(device=scores.device)
    
    return scores

In [23]:
def full_pipeline(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = user_embedding_candidates(approx_index, user_avg_embedding, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [24]:
pipeline_metrics = model.compute_validation_metrics(
    val_dataloader,
    full_pipeline
)

print(f"\nPrecision: {pipeline_metrics['precision']:.4f}")
print(f"Recall: {pipeline_metrics['recall']:.4f}")
print(f"NDCG: {pipeline_metrics['ndcg']:.4f}")

User 0

  candidates = th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0120
Recall: 0.2878
NDCG: 0.1056


### Ideal Retrieval

In [25]:
def ideal_candidates(index, user_embedding, user_id, k, num_items):
#     print(user_id.item())
#     print(val_dataloader.dataset[user_id.item()])  

    val_interactions = val_dataloader.dataset[user_id.item()]["interactions"].coalesce()
    val_item_ids = val_interactions.indices()[1]
    
#     print(val_item_ids)
    
    if not user_embedding.isnan().any():
        neighbor_scores, neighbor_indices = index.search(np.array(user_embedding.cpu()), k)
    else:
        neighbor_indices = th.randint(num_items, (k,))
    
    neighbor_set = set(neighbor_indices.flatten().tolist())
    val_set = set(val_item_ids.flatten().tolist())
    padding_set = neighbor_set - val_set
    
    ideal_indices = list(val_set) + list(padding_set)
    
#     print(f"Neighbor set: {neighbor_set}")
#     print(f"Validation set: {val_set}")
#     print(f"Padding set: {padding_set}")
#     print(f"Combined: {ideal_indices[:10]}")
    
    candidates = th.tensor(ideal_indices).flatten().unique()
    
#     if candidates.shape[0] < k:
#         print(candidates.shape)
    
    return candidates

In [26]:
def ideal_retrieval(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = ideal_candidates(approx_index, user_avg_embedding, user_id, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [27]:
ideal_retrieval_metrics = model.compute_validation_metrics(
    val_dataloader,
    ideal_retrieval
)

print(f"\nPrecision: {ideal_retrieval_metrics['precision']:.4f}")
print(f"Recall: {ideal_retrieval_metrics['recall']:.4f}")
print(f"NDCG: {ideal_retrieval_metrics['ndcg']:.4f}")

User 138000
Precision: 0.0148
Recall: 0.3287
NDCG: 0.1193


### Ideal Filtering

In [28]:
def ideal_filtering(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = user_embedding_candidates(approx_index, user_avg_embedding, k, args.num_items)
#         filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, candidates, k, args.num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [29]:
ideal_filtering_metrics = model.compute_validation_metrics(
    val_dataloader,
    ideal_filtering
)

print(f"\nPrecision: {ideal_filtering_metrics['precision']:.4f}")
print(f"Recall: {ideal_filtering_metrics['recall']:.4f}")
print(f"NDCG: {ideal_filtering_metrics['ndcg']:.4f}")

User 0

  candidates = th.tensor(candidates, dtype=th.long)
  candidates = th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0183
Recall: 0.3351
NDCG: 0.1260


### Ideal Scoring

In [30]:
def ideal_scoring(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = user_embedding_candidates(approx_index, user_avg_embedding, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
        
        # Move interacted items from validation set to the top
        val_interactions = val_dataloader.dataset[user_id]["interactions"].coalesce()
        val_item_ids = val_interactions.indices()[1]
        boosted_scores = raw_scores.clone().detach()
        boosted_scores[val_item_ids] += 10.0
        
        scores = dither_scores(boosted_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [31]:
ideal_scoring_metrics = model.compute_validation_metrics(
    val_dataloader,
    ideal_scoring
)

print(f"\nPrecision: {ideal_scoring_metrics['precision']:.4f}")
print(f"Recall: {ideal_scoring_metrics['recall']:.4f}")
print(f"NDCG: {ideal_scoring_metrics['ndcg']:.4f}")

User 0

  candidates = th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0174
Recall: 0.4300
NDCG: 0.4502


### Ideal Ordering

In [32]:
def ideal_ordering(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = user_embedding_candidates(approx_index, user_avg_embedding, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
#         scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(raw_scores)
        
    return th.stack(user_scores)

In [33]:
ideal_ordering_metrics = model.compute_validation_metrics(
    val_dataloader,
    ideal_ordering
)

print(f"\nPrecision: {ideal_ordering_metrics['precision']:.4f}")
print(f"Recall: {ideal_ordering_metrics['recall']:.4f}")
print(f"NDCG: {ideal_ordering_metrics['ndcg']:.4f}")

User 0

  candidates = th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0132
Recall: 0.3159
NDCG: 0.1144


### Improved Retrieval

In [34]:
def item_embedding_candidates(index, item_embeddings, k, num_items):
    if len(item_embeddings) > 0:
        neighbors_per = max(1, k // max(1,len(item_embeddings)))
        neighbor_scores, neighbor_indices = index.search(np.array(item_embeddings.cpu()), neighbors_per)
    else:
        neighbor_indices = th.randint(num_items, (k,))
        
    return th.tensor(neighbor_indices).flatten().unique()

In [35]:
def item_based_retrieval(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = item_embedding_candidates(approx_index, item_embeddings, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [36]:
item_based_retrieval_metrics = model.compute_validation_metrics(
    val_dataloader,
    item_based_retrieval
)

print(f"\nPrecision: {item_based_retrieval_metrics['precision']:.4f}")
print(f"Recall: {item_based_retrieval_metrics['recall']:.4f}")
print(f"NDCG: {item_based_retrieval_metrics['ndcg']:.4f}")

User 0

  return th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0069
Recall: 0.1966
NDCG: 0.0743


In [37]:
def item_based_retrieval_ideal_filtering(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = item_embedding_candidates(approx_index, item_embeddings, k, args.num_items)
#         filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, candidates, k, args.num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [38]:
item_based_retrieval_ideal_filtering_metrics = model.compute_validation_metrics(
    val_dataloader,
    item_based_retrieval_ideal_filtering
)

print(f"\nPrecision: {item_based_retrieval_ideal_filtering_metrics['precision']:.4f}")
print(f"Recall: {item_based_retrieval_ideal_filtering_metrics['recall']:.4f}")
print(f"NDCG: {item_based_retrieval_ideal_filtering_metrics['ndcg']:.4f}")

User 0

  candidates = th.tensor(candidates, dtype=th.long)
  return th.tensor(neighbor_indices).flatten().unique()


User 138000
Precision: 0.0086
Recall: 0.2195
NDCG: 0.0827


In [39]:
def item_based_retrieval_ideal_scoring(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = item_embedding_candidates(approx_index, item_embeddings, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
        scores = dither_scores(raw_scores, k, dithering_eps)
        
        # Move interacted items from validation set to the top
        val_interactions = val_dataloader.dataset[user_id]["interactions"].coalesce()
        val_item_ids = val_interactions.indices()[1]
        boosted_scores = raw_scores.clone().detach()
        boosted_scores[val_item_ids] += 10.0
        
        user_scores.append(scores)
        
    return th.stack(user_scores)

In [None]:
item_based_retrieval_ideal_scoring_metrics = model.compute_validation_metrics(
    val_dataloader,
    item_based_retrieval_ideal_scoring
)

print(f"\nPrecision: {item_based_retrieval_ideal_scoring_metrics['precision']:.4f}")
print(f"Recall: {item_based_retrieval_ideal_scoring_metrics['recall']:.4f}")
print(f"NDCG: {item_based_retrieval_ideal_scoring_metrics['ndcg']:.4f}")

User 0

  return th.tensor(neighbor_indices).flatten().unique()


User 133000

In [None]:
def item_based_retrieval_ideal_ordering(user_ids, num_items):
    k = 250
    dithering_eps = 1.5
    
    user_scores = []
    for user_id in user_ids:
        if user_id % 1000 == 0:
            print(f"\rUser {user_id}", sep=" ", end="", flush=True)
        
        interactions = train_dataloader.dataset[user_id]["interactions"].coalesce()
        item_ids = interactions.indices()[1]
        
        item_embeddings, user_embedding, user_avg_embedding = \
                fetch_embeddings(model, user_id, item_ids)
        
        candidates = item_embedding_candidates(approx_index, item_embeddings, k, args.num_items)
        filtered = filter_candidates(user_id.cpu().item(), candidates)
        raw_scores = score_candidates(model, user_avg_embedding, filtered, k, args.num_items)
#         scores = dither_scores(raw_scores, k, dithering_eps)
        
        user_scores.append(raw_scores)
        
    return th.stack(user_scores)

In [None]:
item_based_retrieval_ideal_ordering_metrics = model.compute_validation_metrics(
    val_dataloader,
    item_based_retrieval_ideal_ordering
)

print(f"\nPrecision: {item_based_retrieval_ideal_ordering_metrics['precision']:.4f}")
print(f"Recall: {item_based_retrieval_ideal_ordering_metrics['recall']:.4f}")
print(f"NDCG: {item_based_retrieval_ideal_ordering_metrics['ndcg']:.4f}")

### Improved Filtering

In [None]:
# TODO: Try out better Bloom filter parameters

### Improved Scoring

In [None]:
# TODO: Try out using the learned user embeddings for scoring

### Improved Ordering

In [None]:
# TODO: Try reducing the dithering eps