Matrix Factorization Compute
===

Collaborative filtering.

Rendle: https://arxiv.org/pdf/2005.09683.pdf

Fast.ai implementation: https://github.com/fastai/fastbook/blob/master/08_collab.ipynb

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import re
import json
import sys
import pickle
from tqdm import tqdm
from collections import Counter

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# HuggingFace packages
import transformers
import tokenizers
import torch

# more torch imports
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# evaluation
from scipy.stats import rankdata

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
config = cbrec.genconfig.Config()
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths

## Load train data

In [None]:
# Materialize every record yielded for the configured metadata file.
md_list = list(cbrec.utils.stream_metadata_list(config.metadata_filepath))
len(md_list)

In [None]:
# Keep only the metadata records whose 'type' field is 'train'.
train_md_list = [md for md in md_list if md['type'] == 'train']
len(train_md_list)

In [None]:
# Add the full USP columns from the `triple` table to each training metadata record.
# First index the training records by metadata_id so each row can be matched back.
train_inds = {md['metadata_id']: i for i, md in enumerate(train_md_list)}

config = cbrec.genconfig.Config()
db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    # NOTE(review): rows are matched by metadata_id, so ORDER BY RANDOM() looks unnecessary — confirm before removing.
    command = """
    SELECT 
        interaction_timestamp, 
        metadata_id,
        source_user_id,
        source_site_id,
        target_user_id,
        target_site_id,
        alt_user_id,
        alt_site_id
    FROM triple
    ORDER BY RANDOM()
    """
    cursor = db.execute(command)
    if cursor is None:
        raise ValueError("Null cursor.")
    # Pull rows one at a time until the cursor is exhausted.
    while (row := cursor.fetchone()) is not None:
        i = train_inds[row['metadata_id']]
        record = train_md_list[i]
        # sanity check: the row and the record must describe the same source user
        assert row['source_user_id'] == record['source_user_id']
        record.update(row)


In [None]:
# Inspect which keys the first training record carries.
train_md_list[0].keys()

## USP approach

In [None]:
class NeuMF(nn.Module):
    """
    Dot-product matrix factorization with per-source and per-candidate biases.

    Derived from https://github.com/fastai/fastbook/blob/master/08_collab.ipynb

    Parameters
    ----------
    n_source : number of rows in the source embedding tables
    n_candidate : number of rows in the candidate embedding tables
    n_factors : dimensionality of the latent factor vectors
    """
    def __init__(self, n_source, n_candidate, n_factors):
        # nn.Module must be initialized before any submodule is assigned,
        # otherwise the nn.Embedding assignments below raise AttributeError.
        super().__init__()
        self.source_factors = nn.Embedding(n_source, n_factors)
        self.source_bias = nn.Embedding(n_source, 1)
        self.candidate_factors = nn.Embedding(n_candidate, n_factors)
        self.candidate_bias = nn.Embedding(n_candidate, 1)

    def forward(self, x):
        """
        Score (source, candidate) index pairs.

        x is indexed as x[:,0] (source ids) and x[:,1] (candidate ids).
        Returns one unnormalized score per row, with a trailing dimension of 1.
        """
        source_ids = x[:,0]
        candidate_ids = x[:,1]
        sources = self.source_factors(source_ids)
        candidates = self.candidate_factors(candidate_ids)
        res = (sources * candidates).sum(dim=1, keepdim=True)
        res = res + self.source_bias(source_ids) + self.candidate_bias(candidate_ids)
        return res

In [None]:
from collections import Counter
# Count how often each (user_id, site_id) pair occurs in each role of a training triple.
source_c = Counter()
target_c = Counter()
alt_c = Counter()
for md in train_md_list:
    source_c[(md['source_user_id'], md['source_site_id'])] += 1
    target_c[(md['target_user_id'], md['target_site_id'])] += 1
    alt_c[(md['alt_user_id'], md['alt_site_id'])] += 1

In [None]:
# For each role (source, target, alt), tabulate how many USPs occur exactly k times.
v_counts_list = []
for c in [source_c, target_c, alt_c]:
    # index = occurrence count, value = number of USPs with that occurrence count
    vcounts = pd.Series([count for usp, count in c.most_common()]).value_counts()
    v_counts_list.append(vcounts)
    # vcounts[1] raises KeyError if no USP occurs exactly once
    print(f"total={vcounts.sum()} total>1={vcounts[vcounts.index > 1].sum()} unique init counts={len(vcounts)} max inits={vcounts.index.max()} # w 1 inits={vcounts[1]}")

In [None]:
# Assign every USP an integer ID.
# ID 0 is shared by every USP that occurs only once in its role; all other
# USPs get sequential IDs starting at 1, in first-seen order.
source_usp_id_map = {}
candidate_usp_id_map = {}

for md in train_md_list:
    source_usp = (md['source_user_id'], md['source_site_id'])
    target_usp = (md['target_user_id'], md['target_site_id'])
    alt_usp = (md['alt_user_id'], md['alt_site_id'])

    # setdefault only inserts when the USP is new, so len(map) + 1 is always the next free ID
    md['source_usp_id'] = (
        0 if source_c[source_usp] == 1
        else source_usp_id_map.setdefault(source_usp, len(source_usp_id_map) + 1)
    )

    # targets and alts share a single candidate ID space
    md['target_usp_id'] = (
        0 if target_c[target_usp] == 1
        else candidate_usp_id_map.setdefault(target_usp, len(candidate_usp_id_map) + 1)
    )
    md['alt_usp_id'] = (
        0 if alt_c[alt_usp] == 1
        else candidate_usp_id_map.setdefault(alt_usp, len(candidate_usp_id_map) + 1)
    )

# next unused ID in each space
source_i = len(source_usp_id_map) + 1
candidate_i = len(candidate_usp_id_map) + 1
        

In [None]:
# Distribution of assigned source USP IDs (0 is the shared single-occurrence bucket).
pd.Series([md['source_usp_id'] for md in train_md_list]).value_counts()

In [None]:
# Distribution of assigned target USP IDs (0 is the shared single-occurrence bucket).
pd.Series([md['target_usp_id'] for md in train_md_list]).value_counts()

In [None]:
# Distribution of assigned alt USP IDs (0 is the shared single-occurrence bucket).
pd.Series([md['alt_usp_id'] for md in train_md_list]).value_counts()

## User -> Site approach

In [None]:
include_alt = True
MIN_OCCURENCE_COUNT = 1

# Count how often each source user and each (target / alt) site appears in training.
user_counter = Counter()
site_counter = Counter()
for md in train_md_list:
    user_counter[md['source_user_id']] += 1
    if include_alt:
        assert md['target_site_id'] != md['alt_site_id']
        site_counter.update((md['target_site_id'], md['alt_site_id']))
    else:
        site_counter[md['target_site_id']] += 1
print(f"n_users_total={len(user_counter)}; n_sites_total={len(site_counter)}")

# Assign every user and site an integer embedding ID.
# ID 0 is the shared "unknown" bucket for anything seen at most MIN_OCCURENCE_COUNT times;
# everything else gets sequential IDs from 1 in first-seen order.
user_id_map = {}
site_id_map = {}
site_roles = ('target', 'alt') if include_alt else ('target',)

for md in train_md_list:
    source_user_id = md['source_user_id']
    if user_counter[source_user_id] > MIN_OCCURENCE_COUNT:
        md['source_user_emb_id'] = user_id_map.setdefault(source_user_id, len(user_id_map) + 1)
    else:
        md['source_user_emb_id'] = 0

    for key in site_roles:
        site_id = md[key + '_site_id']
        if site_counter[site_id] > MIN_OCCURENCE_COUNT:
            md[key + '_site_emb_id'] = site_id_map.setdefault(site_id, len(site_id_map) + 1)
        else:
            md[key + '_site_emb_id'] = 0

# next unused ID in each space
user_i = len(user_id_map) + 1
site_i = len(site_id_map) + 1

n_users = len(user_id_map)
n_sites = len(site_id_map)
print(f"{n_users=}; {n_sites=}")
source_counts = pd.Series([md['source_user_emb_id'] for md in train_md_list]).value_counts()
assert len(source_counts) == len(user_id_map) + 1  # +1, as 0 isn't in the dictionary

In [None]:
class NeuMF(nn.Module):
    """
    User/site dot-product model with a learned bias per user and per site.

    Derived from https://github.com/fastai/fastbook/blob/master/08_collab.ipynb

    Note: despite the name, this is NOT the "NeuMF" model. It is the plain
    Probabilistic Matrix Factorization approach to Collaborative Filtering.
    """
    def __init__(self, n_users, n_sites, n_factors):
        super().__init__()
        # one latent vector and one scalar bias per user ...
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        # ... and per site
        self.site_factors = nn.Embedding(n_sites, n_factors)
        self.site_bias = nn.Embedding(n_sites, 1)

    def forward(self, x):
        """Return one unnormalized score per row of x, where x[:,0] holds user ids and x[:,1] site ids."""
        user_ids, site_ids = x[:,0], x[:,1]
        interaction = (self.user_factors(user_ids) * self.site_factors(site_ids)).sum(dim=1, keepdim=True)
        return interaction + (self.user_bias(user_ids) + self.site_bias(site_ids))

In [None]:
# Build the pointwise training set: one (user, site) row per example,
# labelled 1 for the target site and 0 for the alt site.
X_train, y_train = [], []

for md in train_md_list:
    user_emb_id = md['source_user_emb_id']
    X_train.append((user_emb_id, md['target_site_emb_id']))
    y_train.append(1)
    if not include_alt:
        continue
    X_train.append((user_emb_id, md['alt_site_emb_id']))
    y_train.append(0)

X = np.array(X_train)
y_true = np.array(y_train)
X.shape, y_true.shape

In [None]:
# total zeros in X
# i.e. the fraction of all entries of X that carry embedding ID 0
(X == 0).sum() / (X.shape[0] * X.shape[1])

In [None]:
# convert types
# labels become float32; X is left as built (it is used to index embeddings in NeuMF.forward)
#X, y_true = X.astype('float64'), y_true.astype('float64')
y_true = y_true.astype('float32')

In [None]:
# Use the first 99% of rows (rounded up) for training and hold out the remainder.
# NOTE(review): the split is positional, not shuffled — confirm this is intended.
n_train = int(np.ceil(0.99 * len(y_true)))
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y_true[:n_train], y_true[n_train:]
X_train.shape, X_test.shape

In [None]:
import logging
def train_model(config, X_train, y_train, X_test, y_test):
    """
    Train a NeuMF model with a pointwise BCE-with-logits loss.

    An adapted form of the implementation from cbrec.modeling.train.train_model

    Parameters
    ----------
    config : object providing the attributes read below: mf_n_users, mf_n_sites,
        mf_n_factors, train_n_epochs, train_verbose, train_validation_rate,
        train_lr_init, train_Adam_beta1, train_Adam_beta2, train_Adam_eps,
        train_weight_decay, train_max_lr and, optionally, minibatch_size
        (defaults to the full training set).
    X_train, X_test : numpy arrays passed to the net; column 0 is used as the
        user index and column 1 as the site index.
    y_train, y_test : numpy label arrays aligned with the rows of X_train / X_test.

    Returns
    -------
    (net, train_metrics, test_metrics)
        Each metrics array has 3 rows: epoch, loss, accuracy. train_metrics has
        one column per epoch; test_metrics has one column per validation pass
        plus a final column holding the post-training evaluation.

    Training stops early once the mean epoch train loss drops below 0.001.
    """
    logger = logging.getLogger("cbrec.modeling.train.train_model")

    n_train = len(y_train)
    n_test = len(y_test)

    minibatch_size = n_train
    if hasattr(config, "minibatch_size"):
        minibatch_size = config.minibatch_size
        logger.info(f"Using minibatch size {minibatch_size}.")
    minibatch_size = min(n_train, minibatch_size)  # if minibatch_size is larger than n_train, force it to n_train
    n_minibatches = int(np.ceil(n_train / minibatch_size))

    # create the net
    net = NeuMF(config.mf_n_users, config.mf_n_sites, config.mf_n_factors)

    n_epochs = config.train_n_epochs
    criterion = nn.BCEWithLogitsLoss()  # pointwise loss function

    X_test_tensor = torch.from_numpy(X_test)
    y_test_tensor = torch.from_numpy(y_test)
    X_train_tensor = torch.from_numpy(X_train)
    y_train_tensor = torch.from_numpy(y_train)
    y_train_tensor = y_train_tensor.view(-1, 1)  # make labels 2-dimensional
    #y_train_tensor = y_train_tensor.type_as(X_train_tensor)
    if config.train_verbose:
        logger.info(f"Input tensor sizes: {X_train_tensor.size()}, {y_train_tensor.size()}")
        logger.info(f"Validating model every {int(1/config.train_validation_rate)} epochs for {n_epochs} epochs.")

    # _metrics[0] -> Epoch, metrics[1] -> loss, _metrics[2] -> accuracy
    test_metrics = np.zeros((3,int(n_epochs*config.train_validation_rate+1))) #+1 to ensure space for final epoch metric
    train_metrics = np.zeros((3,n_epochs))

    assert n_epochs > 0
    optimizer = optim.Adam(net.parameters(),
                           lr=config.train_lr_init,
                           betas=(config.train_Adam_beta1, config.train_Adam_beta2),
                           eps=config.train_Adam_eps,
                           weight_decay=config.train_weight_decay)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.train_max_lr,
        steps_per_epoch=n_minibatches,
        epochs=n_epochs,
    )
    for epoch in range(n_epochs):
        s = datetime.now()

        # shuffle the training data
        # I am not sure if this matters at all
        epoch_order = torch.randperm(n_train)

        mb_metrics = []  # store the minibatch_metrics, then average after
        for minibatch in range(n_minibatches):
            minibatch_start = minibatch * minibatch_size
            minibatch_end = min(minibatch_start + minibatch_size, n_train)
            if config.train_verbose and epoch == 0:
                logger.info(f"    Minibatch for inds in {minibatch_start} - {minibatch_end}.")
            minibatch_inds = epoch_order[minibatch_start:minibatch_end]

            inputs = X_train_tensor[minibatch_inds]
            train_labels = y_train_tensor[minibatch_inds]

            net.train()
            # Clear gradients before every step; clearing only once per epoch
            # would accumulate gradients across the minibatches of that epoch.
            optimizer.zero_grad()
            train_outputs = net(inputs)
            train_loss = criterion(train_outputs, train_labels)
            train_loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # compute accuracy
            y_train_pred = torch.sigmoid(train_outputs.detach()).view((-1,)).numpy()
            y_train_pred = (y_train_pred >= 0.5).astype(int)  # binarize predictions with a 0.5 decision boundary
            y_train_minibatch = y_train[minibatch_inds.numpy()]
            train_acc = np.sum(y_train_pred == y_train_minibatch) / len(y_train_minibatch)

            mb_metrics.append((train_loss.item(), train_acc))
        # epoch-level metrics are the unweighted mean over minibatches
        train_loss, train_acc = np.mean(np.array(mb_metrics), axis=0)
        train_metrics[0,epoch] = epoch
        train_metrics[1,epoch] = train_loss
        train_metrics[2,epoch] = train_acc

        should_stop_early = train_loss < 0.001
        if config.train_verbose and (epoch < 3 or epoch == n_epochs - 1 or epoch % 10 == 0 or should_stop_early):
            logger.info(f"{epoch:>3} ({datetime.now() - s}): train loss={train_loss:.4f} train accuracy={train_acc*100:.2f}% LR={optimizer.param_groups[0]['lr']:.2E}")
        if should_stop_early:
            break

        # periodic validation on the held-out set
        if epoch % (1/config.train_validation_rate) == 0:
            net.eval()
            with torch.no_grad():
                test_outputs = net(X_test_tensor)
                test_loss = criterion(test_outputs.detach(), y_test_tensor.unsqueeze(1).float())
                y_test_pred = torch.sigmoid(test_outputs.detach()).view((-1,)).numpy()
                y_test_pred = (y_test_pred >= 0.5).astype(int)
                test_acc = np.sum(y_test_pred == y_test) / len(y_test)
            logger.info(f"    {epoch:>3}: test loss={test_loss:.4f} test accuracy={test_acc*100:.2f}%")
            metric_ind = int(epoch*config.train_validation_rate)
            if metric_ind > 0 and test_loss <= np.min(test_metrics[1,:metric_ind]):
                # this is the lowest loss we've reached
                # TODO could consider saving `net` at `epoch`.
                logger.info(f"    Best validation lost achieved so far.")
            test_metrics[0,metric_ind] = epoch
            test_metrics[1,metric_ind] = test_loss
            test_metrics[2,metric_ind] = test_acc

    if config.train_verbose and n_epochs > 0:
        final_train_loss = train_loss
        final_epoch_count = epoch + 1
        logger.info(f"Completed {final_epoch_count} epochs with a final train loss of {final_train_loss:.4f}.")

    # final evaluation; stored in the last column of test_metrics
    net.eval()
    with torch.no_grad():
        X_test_tensor = torch.from_numpy(X_test)
        outputs = net(X_test_tensor)
        test_loss = criterion(outputs.detach(), y_test_tensor.unsqueeze(1).float())
        logger.info(f"Test loss: {test_loss.item():.4f}")
        y_test_pred = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
        y_test_pred = (y_test_pred >= 0.5).astype(int)
        acc = np.sum(y_test_pred == y_test) / len(y_test)
        logger.info(f"Test acc: {acc*100:.2f}%")
        test_metrics[0, test_metrics.shape[1] - 1] = epoch
        test_metrics[1, test_metrics.shape[1] - 1] = test_loss
        test_metrics[2, test_metrics.shape[1] - 1] = acc
    return net, train_metrics, test_metrics

In [None]:
import cbrec.modeling
import cbrec.modeling.modelconfig
# Train a single model with 128 factors for 1000 epochs.
config = cbrec.modeling.modelconfig.ModelConfig()
# +1 on the table sizes because embedding ID 0 is not stored in the id maps
# NOTE(review): train_lr_init is handed to Adam, but OneCycleLR drives the LR from train_max_lr — confirm it has any effect.
run_settings = {
    'mf_n_users': len(user_id_map) + 1,
    'mf_n_sites': len(site_id_map) + 1,
    'mf_n_factors': 128,
    'train_verbose': True,
    'train_max_lr': 0.3,
    'train_lr_init': 0.8,
    'train_weight_decay': 0.0001,
    'train_n_epochs': 1000,
}
for name, value in run_settings.items():
    setattr(config, name, value)
net, train_metrics, test_metrics = train_model(config, X_train, y_train, X_test, y_test)

In [None]:
import cbrec.modeling
import cbrec.modeling.modelconfig
# Grid search over weight decay and number of latent factors; every trained net is kept with a descriptive label.
nets = []
weight_decay_grid = [0, 0.0001, 0.01, 0.1]
n_factors_grid = [8, 16, 32, 64, 128]   # [16, 32, 64, 96, 128, 192] is the list used in the NMF revisited paper
for train_weight_decay in weight_decay_grid:
    for mf_n_factors in n_factors_grid:
        config = cbrec.modeling.modelconfig.ModelConfig()
        run_settings = {
            'mf_n_users': len(user_id_map) + 1,
            'mf_n_sites': len(site_id_map) + 1,
            'mf_n_factors': mf_n_factors,
            'train_verbose': False,
            'train_max_lr': 0.3,
            'train_lr_init': 0.8,
            'train_weight_decay': train_weight_decay,
            'train_n_epochs': 100,
        }
        for name, value in run_settings.items():
            setattr(config, name, value)
        net, train_metrics, test_metrics = train_model(config, X_train, y_train, X_test, y_test)
        nets.append((f"n_factors={mf_n_factors}; wd={train_weight_decay}", net))

## Score models

In [None]:
# Make the project package importable when it is not already on the path.
try:
    import cbrec
except ImportError:
    # Only a failed import should trigger the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt and unrelated errors.
    sys.path.append("/home/lana/levon003/repos/recsys-peer-match/src")

import cbrec.featuredb
import cbrec.genconfig
import cbrec.utils
import cbrec.evaluation
import cbrec.reccontext
import cbrec.recentActivityCounter
import cbrec.modeling.text_loader
import cbrec.modeling.reccontext_builder
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
# Collect the 'test' and 'predict' metadata records and index them by metadata_id.
test_md_list = [md for md in md_list if md['type'] in ('test', 'predict')]
test_md_map = dict((md['metadata_id'], md) for md in test_md_list)
len(test_md_map)

In [None]:
# Score every test/predict context with each trained net.
# Contexts with a target get their metrics written as one JSON line each;
# contexts without a target (prediction-only) have their raw scores pickled for coverage analysis.
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000

config = cbrec.genconfig.Config()
db = cbrec.featuredb.get_db_by_filepath(config.feature_db_filepath)
with db:
    test_filepath = os.path.join(config.feature_data_dir, 'mf_metrics.ndjson')
    scores_filepath = os.path.join(config.feature_data_dir, 'mf_coverage_scores.pkl')
    scores = []
    net.eval()
    with open(test_filepath, 'w') as metrics_outfile, open(scores_filepath, 'wb') as scores_outfile, torch.no_grad():

        for test_context in tqdm(cbrec.featuredb.stream_test_contexts(db, config), desc='Streaming test contexts', total=len(test_md_map)):
            test_context_md = test_md_map[test_context['metadata_id']]

            rc = cbrec.reccontext.RecContext.create_from_test_context(config, test_context_md, test_context)
            has_target = rc.md['has_target']

            # validation-period contexts are scored too; they are separated out when the metrics are read back

            save_scores = not has_target  # save scores if this is a prediction target (for coverage)
            scorer = cbrec.evaluation.Scorer(config, rc, save_scores=save_scores)

            # Map the source user and every candidate site to embedding IDs;
            # anything absent from the id maps falls back to the shared ID 0.
            site_id_arr = scorer.site_id_arr
            X_pred = np.zeros((len(site_id_arr), 2), dtype='int64')
            X_pred[:,0] = user_id_map.get(rc.source_user_id, 0)
            X_pred[:,1] = [site_id_map.get(site_id, 0) for site_id in site_id_arr]
            rc.md['user_known'] = rc.source_user_id in user_id_map
            rc.md['n_sites_known'] = int((X_pred[:,1] != 0).sum())
            X_pred = torch.from_numpy(X_pred)
            for net_name, net in nets:
                outputs = net(X_pred)
                y_score = torch.sigmoid(outputs.detach()).view((-1,)).numpy()
                scorer.compute_metrics(y_score, net_name)

            rc.md['metrics'] = scorer.metrics_dict
            if save_scores:
                rc.md['scores'] = scorer.scores_dict # {key: list(value) for key, value in scorer.scores_dict.items()}
                scores.append(rc.md)
                # scores are pickled in batches of 1000, each as a separate pickle object
                if len(scores) == 1000:
                    pickle.dump(scores, scores_outfile)
                    logging.info(f"Saved pickle with {len(scores)} scores.")
                    scores = []
            else:
                line = json.dumps(rc.md) + "\n"
                metrics_outfile.write(line)

        # Flush the final partial batch; without this, any scores beyond the
        # last full batch of 1000 would never reach the file.
        if scores:
            pickle.dump(scores, scores_outfile)
            logging.info(f"Saved pickle with {len(scores)} scores.")

In [None]:
!head -n 1 /home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/mf_metrics.ndjson

In [None]:
# Split the targeted test/predict records into validation (at or before the cutoff) and test (after it).
VALIDATION_END_TIMESTAMP = datetime.strptime("2021-07-01", "%Y-%m-%d").timestamp() * 1000
md_list = [md for md in md_list if md['type'] in ('test', 'predict')]

validation_md_list, test_md_list = [], []
for md in md_list:
    if not md['has_target']:
        continue
    if md['timestamp'] <= VALIDATION_END_TIMESTAMP:
        validation_md_list.append(md)
    elif md['timestamp'] > VALIDATION_END_TIMESTAMP:
        test_md_list.append(md)
print(len(test_md_list))

validation_metadata_ids = {md['metadata_id'] for md in validation_md_list}
test_metadata_ids = {md['metadata_id'] for md in test_md_list}
len(test_metadata_ids)

In [None]:
from collections import defaultdict
# Read the saved metrics back and collect, per split and per model, the rank each model gave the true target.
all_target_ranks = {'validation': defaultdict(list), 'test': defaultdict(list)}
baseline_test_filepath = os.path.join(config.feature_data_dir, 'mf_metrics.ndjson')
with open(baseline_test_filepath, 'r') as metadata_file:
    for line in tqdm(metadata_file, desc='Reading metrics', disable=False):
        md = json.loads(line)
        metadata_id = md['metadata_id']
        if metadata_id in test_metadata_ids:
            split = 'test'
        elif metadata_id in validation_metadata_ids:
            split = 'validation'
        else:
            # belongs to neither split
            continue

        target_ranks = all_target_ranks[split]
        for model_name, metrics in md["metrics"].items():
            target_ranks[model_name].append(metrics['target_rank'])
assert len(all_target_ranks['validation']) > 0
print(all_target_ranks['validation'].keys())
len(all_target_ranks['validation'])

In [None]:
# Ranking metrics per model on the validation split: MRR, hit rate @1 and @5 (in percent).
target_ranks = all_target_ranks['validation']
d = []
for model_name, rank_list in target_ranks.items():
    ranks = np.array(rank_list)
    n_ranks = len(ranks)
    d.append({
        'model': model_name,
        'n': n_ranks,
        'mrr': (1 / ranks).mean(),
        'hr1': (ranks == 1).sum() / n_ranks * 100,
        'hr5': (ranks <= 5).sum() / n_ranks * 100,
    })
eval_df = pd.DataFrame(d).sort_values(by='mrr', ascending=False)
eval_df

In [None]:
# Ranking metrics per model on the test split: MRR, hit rate @1 and @5 (in percent).
# All models are printed for convenience, but only the best value is reported.
target_ranks = all_target_ranks['test']
d = []
for model_name, rank_list in target_ranks.items():
    ranks = np.array(rank_list)
    n_ranks = len(ranks)
    d.append({
        'model': model_name,
        'n': n_ranks,
        'mrr': (1 / ranks).mean(),
        'hr1': (ranks == 1).sum() / n_ranks * 100,
        'hr5': (ranks <= 5).sum() / n_ranks * 100,
    })
eval_df = pd.DataFrame(d).sort_values(by='mrr', ascending=False)
eval_df

In [None]:
# TODO
# It is reasonable to think that the Matrix Factorization models drop off a LOT after the training period is over, since any user/site not seen at least twice in the training period gets assigned the unknown vector.
# So, we should do a time-based analysis, as was done in LinearnetResults.ipynb.
# However, given the lack of difference between the validation and test periods, this is probably low priority.

In [None]:
class CoverageHelper:
    # Attribute-less stand-in class; presumably it must exist under this name
    # so that pickle can rebuild the saved object — TODO confirm.
    def __init__(self):
        pass

# load cov_helper from pickle
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
coverage_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/feature_data/coverage"
with open(os.path.join(coverage_dir, 'cov_helper.pkl'), 'rb') as coverage_helper_file:
    cov_helper = pickle.load(coverage_helper_file)
# show which attributes the loaded helper carries
cov_helper.__dict__.keys()

In [None]:
# Size of the helper's sites_with_previous_ints collection.
len(cov_helper.sites_with_previous_ints)

In [None]:
def break_ties(site_id_arr, y_score_site, sort_inds, n=5):
    """
    Select the top-n scored sites, resolving ties at the cut-off uniformly at random.

    Walks the scores from highest to lowest. A group of equal scores that fits
    into the remaining slots is taken whole; a group that does not fit is
    sampled without replacement via np.random.choice.

    Parameters
    ----------
    site_id_arr : array of site ids aligned with y_score_site
    y_score_site : array of scores, one per site
    sort_inds : np.argsort(y_score_site), i.e. ascending order
    n : number of sites to select (default 5)

    Returns
    -------
    (scores, site_ids, ties_broken)
        scores and site_ids are arrays in descending score order, of length
        min(n, number of sites); ties_broken is True when a random draw was needed.
    """
    top_scores = []
    top_site_ids = []
    ties_broken = False
    pos = len(sort_inds) - 1  # position in sort_inds, starting at the highest score
    while len(top_scores) < n and pos >= 0:
        score = y_score_site[sort_inds[pos]]
        if pos > 0 and score == y_score_site[sort_inds[pos - 1]]:
            # pos is the start of a group of tied scores
            tied_inds = np.flatnonzero(y_score_site == score)
            n_remaining = n - len(top_scores)
            if len(tied_inds) > n_remaining:
                # the group overflows the remaining slots: sample from it
                tied_inds = np.random.choice(tied_inds, size=n_remaining, replace=False)
                ties_broken = True
            top_scores.extend([score] * len(tied_inds))
            top_site_ids.extend(site_id_arr[tied_inds])
            pos -= len(tied_inds)
        else:
            top_scores.append(score)
            top_site_ids.append(site_id_arr[sort_inds[pos]])
            pos -= 1
    return np.array(top_scores), np.array(top_site_ids), ties_broken


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def compute_coverage_metrics(model_coverage_scores, cov_helper):
    """
    Compute recommendation-coverage statistics for every model.

    Parameters
    ----------
    model_coverage_scores : iterable of dicts with a 'metadata_id' and a
        'scores' dict mapping model name -> score array.
    cov_helper : object providing n, site_id_arr_map, eligible_sites,
        sites_with_previous_ints, timestamp and site_first_journal_timestamp_map.

    Returns
    -------
    list of dicts, one per model, with coverage counts, interaction ratios and
    site-age summaries (ages in weeks).

    Raises
    ------
    ValueError
        If tie-breaking yields scores that differ from the plain top-n scores.
    """
    n_recs = cov_helper.n
    model_recs = defaultdict(list)
    n_ties_broken = 0
    for scores_md in model_coverage_scores:
        metadata_id = scores_md['metadata_id']
        site_id_arr = cov_helper.site_id_arr_map[metadata_id]
        for model_name, y_score_site in scores_md['scores'].items():

            assert y_score_site.shape == site_id_arr.shape

            # create rec batch
            sort_inds = np.argsort(y_score_site)

            # Look one score past the cut-off so that a tie straddling the
            # top-n boundary is detected as well.
            boundary_scores = y_score_site[sort_inds[-(n_recs + 1):]]
            if len(set(boundary_scores)) != len(boundary_scores):
                highest_scores, highest_score_site_ids, ties_broken = break_ties(
                    site_id_arr, y_score_site, sort_inds, n=n_recs
                )
                expected_scores = np.flip(y_score_site[sort_inds[-n_recs:]])
                if not np.array_equal(highest_scores, expected_scores):
                    # fail loudly rather than returning a value of a different type
                    raise ValueError(
                        f"Tie-breaking produced unexpected scores for metadata_id={metadata_id}, "
                        f"model={model_name}: {highest_scores} != {expected_scores}"
                    )
                if ties_broken:
                    n_ties_broken += 1
            else:
                highest_score_site_ids = site_id_arr[sort_inds[-n_recs:]]
            model_recs[model_name].append(list(highest_score_site_ids))
    print(f"{n_ties_broken=}")

    cov_data = []
    for model_name, recs in model_recs.items():
        recced_sites = set()
        for rec in recs:
            recced_sites.update(rec)
        nonrecced_sites = cov_helper.eligible_sites - recced_sites
        # total recommendation slots issued, instead of a hard-coded 5 * 1000
        n_total_recs = sum(len(rec) for rec in recs)

        recced_inted = _safe_ratio(len(recced_sites & cov_helper.sites_with_previous_ints), len(recced_sites))
        nonrecced_inted = _safe_ratio(len(nonrecced_sites & cov_helper.sites_with_previous_ints), len(nonrecced_sites))

        site_ages = []
        for rec in recs:
            ages = np.array([cov_helper.timestamp - cov_helper.site_first_journal_timestamp_map[site_id] for site_id in rec])
            ages = ages / 1000 / 60 / 60 / 24 / 7  # convert to weeks
            assert np.all(ages > 0)
            site_ages.append({
                'min': ages.min(),
                'median': np.median(ages),
            })
        mean_min_age = np.mean([a['min'] for a in site_ages])
        mean_median_age = np.mean([a['median'] for a in site_ages])

        cov_data.append({
            'model': model_name,
            'n_recced_sites': len(recced_sites),
            'n_nonrecced_sites': len(nonrecced_sites),
            'pct_eligible_recced': _safe_ratio(len(recced_sites), len(cov_helper.eligible_sites)),
            'pct_unique_recs': _safe_ratio(len(recced_sites), n_total_recs),
            'pct_recced_with_int': recced_inted,
            'pct_nonrecced_with_int': nonrecced_inted,
            'pct_recced_without_int': 1 - recced_inted,
            'pct_nonrecced_without_int': 1 - nonrecced_inted,
            'ratio_int': _safe_ratio(recced_inted, nonrecced_inted),
            'ratio_noint': _safe_ratio(1 - recced_inted, 1 - nonrecced_inted),
            'mean_min_age': mean_min_age,
            'mean_median_age': mean_median_age,
        })
    return cov_data

In [None]:
# Load the saved coverage scores.
# NOTE(review): a single pickle.load reads only the first pickled object in the file — confirm that is all that is needed.
baseline_scores_filepath = os.path.join(config.feature_data_dir, 'mf_coverage_scores.pkl')
with open(baseline_scores_filepath, 'rb') as pickled_scores:
    scores = pickle.load(pickled_scores)
len(scores)

In [None]:
# Compute per-model coverage statistics from the loaded scores.
coverage_metrics = compute_coverage_metrics(scores, cov_helper)
len(coverage_metrics)

In [None]:
# Show the coverage statistics as a table.
pd.DataFrame(coverage_metrics)

In [None]:
# Print the coverage-only columns as LaTeX table rows (the three ranking-metric columns are left blank).
edf = pd.DataFrame(coverage_metrics)
for r in edf.itertuples():
    # every '%' is escaped for LaTeX after formatting
    print(f"{r.model} &  &  &  & {r.n_recced_sites} & {r.pct_unique_recs:.2%} & {r.mean_min_age:.1f} weeks & {r.pct_recced_without_int:.1%} / {r.pct_nonrecced_without_int:.1%} = {r.ratio_noint:.2f} \\\\".replace("%", "\\%"))

In [None]:
# Join the ranking metrics in eval_df with the coverage metrics on the model name.
edf = eval_df.merge(pd.DataFrame(coverage_metrics), on='model')
edf

In [None]:
# print in latex table form
for r in edf.itertuples():
    print(f"{r.model} & {r.mrr:.3f} & {r.hr1:.2f}% & {r.hr5:.2f}% & {r.n_recced_sites} & {r.pct_unique_recs:.1%} & {r.mean_min_age:.1f} weeks & {r.pct_recced_without_int:.1%} / {r.pct_nonrecced_without_int:.1%} = {r.ratio_noint:.2f} \\\\".replace("%", "\\%"))